Competeing in the 2021 DSAC DVA competition!

Package Setup:

library(tidyverse)
library(janitor)
library(lubridate)
library(forcats)
library(gganimate)

Task: Create a plot using the folowing datasets…

  1. Christmas songs in the Billboard top 100 list during December from 1958 to 2017 (christmas_billboard_data.csv)

  2. Weather in Chicago on Christmas day from 1871 to 2018 (ChicagoWeatherChristmas.csv)

  3. The gifts and the quantity of gifts acquired each day in “12 days of Christmas” (12_Days_of_Christmas.csv)

Here are the datasets:

weather <- read_csv(file="datasets/ChicagoWeatherChristmas.csv") %>% clean_names()
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Year = col_double(),
##   `Fahrenheit High Temp` = col_double(),
##   `Fahrenheit Min Temp` = col_double(),
##   Precipitation = col_double(),
##   Snow = col_double(),
##   `White Christmas` = col_character(),
##   `Celcius Min Temp` = col_double(),
##   `Celcius High Temp` = col_double()
## )
songs <- read_csv("datasets/christmas_billboard_data.csv") %>% clean_names()
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   url = col_character(),
##   weekid = col_character(),
##   week_position = col_double(),
##   song = col_character(),
##   performer = col_character(),
##   songid = col_character(),
##   instance = col_double(),
##   previous_week_position = col_double(),
##   peak_position = col_double(),
##   weeks_on_chart = col_double(),
##   year = col_double(),
##   month = col_double(),
##   day = col_double()
## )

First let’s take a look at the weather

It should be easier to visualize this data by decade instead of each individual year.

After that I want to do things like:

weather_defined <- subset(weather, white_christmas!="Not Defined") %>%
  mutate(decade= year-(year %% 10))

weather_defined$white_christmas <- as.logical(weather_defined$white_christmas)
white_xmas <- weather_defined %>% group_by(decade) %>% 
  summarize(n = n(), 
            white_xmases=sum(white_christmas), 
            total_snow=sum(snow), 
            total_precip = sum(precipitation),
            percent_white =white_xmases/(white_xmases+n),
            ave_snow = mean(snow),
            sd_snow = sd(snow)
                            )
white_xmas
## # A tibble: 14 x 8
##    decade     n white_xmases total_snow total_precip percent_white ave_snow
##     <dbl> <int>        <int>      <dbl>        <dbl>         <dbl>    <dbl>
##  1   1880     4            2        3.3         0.3          0.333   0.825 
##  2   1890     6            1        0.3        NA            0.143   0.05  
##  3   1900     5            2        8.2         0.92         0.286   1.64  
##  4   1910     9            3        1.8         0.19         0.25    0.2   
##  5   1920     8            4        1.2         0.23         0.333   0.15  
##  6   1930     9            3        8.7        NA            0.25    0.967 
##  7   1940     8            5        3.8         0.75         0.385   0.475 
##  8   1950     7            3        9.4        NA            0.3     1.34  
##  9   1960     7            4        6.7         0.73         0.364   0.957 
## 10   1970     6            4        2.7         0.51         0.4     0.45  
## 11   1980     6            1        0.5         0.5          0.143   0.0833
## 12   1990     8            2        0.7        NA            0.2     0.0875
## 13   2000     8            4        2.9        NA            0.333   0.362 
## 14   2010     9            4        2.3         0.24         0.308   0.256 
## # … with 1 more variable: sd_snow <dbl>

Now let’s plot it in a festive way!

ggplot(white_xmas, aes(x=decade, y=total_snow)) + geom_line() + geom_point(aes(size=n), color="white") + theme_minimal() + scale_x_continuous(breaks=unique(white_xmas$decade)) + labs(x="Decade", y="Total Chirstmas Snowfall (inches)", size="# White Christmases") +
ggtitle("It Still Snows on Christmas in Chicago", subtitle = "...just not as much as it used to") +
  theme(plot.background = element_rect(fill="lightsteelblue3"),
        panel.grid.minor = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.major.y = element_line(linetype="dashed"),
        text = element_text(colour="white"),
        axis.text = element_text(colour="white")
        )

ggsave("StillSnows.jpg", height=5, width=8)

Now on to the holiday songs!

songs$song <- factor(songs$song)

One interesting metric is how far away from the actual date of Christmas each of these weeks are. Let’s normalize the time component to 12/25 and see what it looks like.

songs$week_date <- mdy(songs$weekid)
#Set all years to 2000 or 2001 if the month is January
for (i in 1:length(songs$week_date)){
  if (month(songs$week_date[i])==1){
    year(songs$week_date[i]) <- 2001 
  }
  else {
    year(songs$week_date[i]) <- 2000
  }
  
}
xmas <- mdy("12-25-2000")
songs <- mutate(songs, xmas_dist = as.numeric(difftime(week_date, xmas, units="days")))
summary(songs$xmas_dist)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -50.000  -7.500   3.000   2.364  12.000  37.000

Now summarize by decade again to decluter the plot

songs <- mutate(songs, decade= year-(year %% 10))
songs$decade <- factor(songs$decade, ordered = T)
summary(songs$decade)
## 1950 1960 1970 1980 1990 2000 2010 
##   21  144   44   35   24   50   69

Interesting in itself that there are so many more in the 1960s than any other decade

Below I am doing a few things. First, I want to calculate a single score to measure the “hit potential” of each song over these holiday months. I call this the “hit score”, and it is calculated by taking (100/peak_position) * weeks on the charts.

The higher the hit score, the higher and longer a particular song was on the charts for.

hit_score <- songs %>% group_by(performer, songid) %>% 
  summarise(peak_position=mean(peak_position), 
            weeks=mean(weeks_on_chart), 
            hit_score=(100/peak_position)*weeks) %>% 
  arrange(desc(hit_score)) %>% 
  ungroup()
## `summarise()` has grouped output by 'performer'. You can override using the `.groups` argument.
songs2 <- left_join(songs, select(hit_score, songid, hit_score), by="songid") 

#Need to fix the one repeat with EXACT same hit score
songs2[songs2$songid=="BelieveBrooks & Dunn",]$hit_score <- 34

Now I want to rank each of the songs by hit_score based on the year and ALL previous years. This way we can see which decades generated the most chart-topping songs

songs_list <- list()
years <- unique(songs2$year)
for (i in 1:length(years)) {
  songs_list[[i]] <- subset(songs2, year <=years[i]) %>%
    ungroup() %>%
    distinct(songid, .keep_all=T) %>%
    #group_by(songid) %>%
    mutate(rank=rank(-hit_score),
           rank_year = years[[i]]) %>%
    subset(rank<25) %>%
    select(songid, performer, song, hit_score, year,decade,rank, rank_year)
}

songs_big_df <- Reduce(
  function(x, y, ...) merge(x, y, all = TRUE, ...),
  songs_list
)

songs_big_df <- mutate(songs_big_df, 
                       label = paste(song, performer, year, sep=" "))

And animate it all over time!

color_pal <- c("1950"="#3C4930", "1960"="#8AAEE2", "1970"="#A6001D",
               "1980"="#D00016","1990"="#D7BA5C",
               "2000"="#BF5E73","2010"="#AF8952")


animation <- ggplot(songs_big_df, aes(rank, group = songid,
                          fill = as.factor(decade), color = as.factor(decade))) +
  geom_tile(aes(y = hit_score/2,
                height = hit_score,
                width = 0.9), alpha = 0.8, color = NA) +
  geom_text(aes(y = 0, label = paste(label, " ")), vjust = 0.1, hjust = -0.2, color="black") +
  coord_flip(clip = "off", expand = FALSE) +
  scale_x_reverse() +
  scale_fill_manual(values=color_pal) +
  guides(color = FALSE, fill = FALSE) +
  labs(y="Hit Score (peak position * weeks on chart)") +
  theme(axis.line=element_blank(),
        #text = element_text(family = "Varela Round"),
        #axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        #axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        legend.position="none",
        panel.background=element_blank(),
        panel.border=element_blank(),
        panel.grid.major=element_blank(),
        panel.grid.minor=element_blank(),
        panel.grid.major.x = element_line( size=.1, color="grey" ),
        panel.grid.minor.x = element_line( size=.1, color="grey" ),
        plot.title=element_text(size=25, 
                                #hjust=0.5, 
                                face="bold", colour="firebrick",
                                #vjust=-1
                                ),
        plot.subtitle=element_text(size=18, 
                                   #hjust=0.5, 
                                   face="italic", 
                                   color="darkgreen"),
        plot.caption =element_text(size=8, 
                                   #hjust=0.5, 
                                   face="italic", color="darkgrey"),
        plot.background=element_rect(fill="#FFEDE1"
                                       #"#DDE4F8"
                                     ),
        #plot.margin = margin(2,2, 2, 4, "cm")
        ) +
  transition_states(rank_year, transition_length = 4, state_length = 1) +
  enter_fade() + 
  exit_fade() +
  labs(title = 'Top 25 Holiday Hits Over the Years',
    subtitle = 'Year: {closest_state}',
    caption = "Data: Hot 100 singles chart from Billboard.com")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
#animation
animate(animation, duration=30,fps=20, width=450, height=600, 
        renderer=gifski_renderer("holidayhits.gif"))

Now I’ll use my “Days to Christmas” variable to make some plots

top_song <- subset(songs, songid=="This One's For The ChildrenNew Kids On The Block"|
                     songid =="MistletoeJustin Bieber" |
                     songid == "AmenThe Impressions" |
                     songid == "All I Want For Christmas Is YouMariah Carey" |
                     songid =="Same Old Lang SyneDan Fogelberg") %>%
            mutate(label = paste(song, performer, sep=" "))
top_song <- top_song %>% group_by(song)
ggplot(top_song, aes(x=xmas_dist, y=week_position, group=song)) + geom_line(aes(color=label)) + scale_y_reverse() + theme_minimal() + 
  labs(x="Days to Christmas", y = "Chart Position") +
  transition_reveal(xmas_dist, )

This project was fun! I learned a few things along the way…namely:

Until next time!